You are currently looking at version 1.0 of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the Jupyter Notebook FAQ course resource.


The Series Data Structure


In [1]:
import pandas as pd
pd.Series?

In [2]:
animals = ['Tiger', 'Bear', 'Moose']
pd.Series(animals)


Out[2]:
0    Tiger
1     Bear
2    Moose
dtype: object

In [3]:
numbers = [1, 2, 3]
pd.Series(numbers)


Out[3]:
0    1
1    2
2    3
dtype: int64

In [36]:
animals = ['Tiger', 'Bear', None]
df = pd.Series(animals)
df['number_column'] = -99999
df


Out[36]:
0                 Tiger
1                  Bear
2                  None
number_column    -99999
dtype: object

In [5]:
numbers = [1, 2, None]
pd.Series(numbers)


Out[5]:
0    1.0
1    2.0
2    NaN
dtype: float64

In [6]:
import numpy as np
np.nan == None


Out[6]:
False

In [7]:
np.nan == np.nan


Out[7]:
False

In [8]:
np.isnan(np.nan)


Out[8]:
True

In [9]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s


Out[9]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [10]:
s.index


Out[10]:
Index(['Archery', 'Golf', 'Sumo', 'Taekwondo'], dtype='object')

In [11]:
s = pd.Series(['Tiger', 'Bear', 'Moose'], index=['India', 'America', 'Canada'])
s


Out[11]:
India      Tiger
America     Bear
Canada     Moose
dtype: object

In [12]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports, index=['Golf', 'Sumo', 'Hockey'])
s


Out[12]:
Golf      Scotland
Sumo         Japan
Hockey         NaN
dtype: object

Querying a Series


In [13]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s


Out[13]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [14]:
s.iloc[3]


Out[14]:
'South Korea'

In [15]:
s.loc['Golf']


Out[15]:
'Scotland'

In [16]:
s[3]


Out[16]:
'South Korea'

In [17]:
s['Golf']


Out[17]:
'Scotland'

In [38]:
sports = {99: 'Bhutan',
          100: 'Scotland',
          101: 'Japan',
          102: 'South Korea'}
s = pd.Series(sports)

In [39]:
s.iloc[0] #This won't call s.iloc[0] as one might expect, it generates an error instead


Out[39]:
'Bhutan'

In [40]:
s = pd.Series([100.00, 120.00, 101.00, 3.00])
s


Out[40]:
0    100.0
1    120.0
2    101.0
3      3.0
dtype: float64

In [41]:
total = 0
for item in s:
    total+=item
print(total)


324.0

In [23]:
import numpy as np

total = np.sum(s)
print(total)


324.0

In [43]:
#this creates a big series of random numbers
s = pd.Series(np.random.randint(0,1000,10000))
s.head()


Out[43]:
0    219
1    517
2    475
3    805
4    950
dtype: int64

In [44]:
len(s)


Out[44]:
10000

In [45]:
%%timeit -n 100
summary = 0
for item in s:
    summary+=item


100 loops, best of 3: 1.71 ms per loop

In [46]:
%%timeit -n 100
summary = np.sum(s)


100 loops, best of 3: 168 µs per loop

In [47]:
s+=2 #adds two to each item in s using broadcasting
s.head()


Out[47]:
0    221
1    519
2    477
3    807
4    952
dtype: int64

In [48]:
for label, value in s.iteritems():
    s.set_value(label, value+2)
s.head()


Out[48]:
0    223
1    521
2    479
3    809
4    954
dtype: int64

In [49]:
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
for label, value in s.iteritems():
    s.loc[label]= value+2


10 loops, best of 3: 1.58 s per loop

In [50]:
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
s+=2


10 loops, best of 3: 895 µs per loop

In [57]:
s = pd.Series([1, 2, 3])
s.loc['Animal'] = 'Bears'
s


Out[57]:
0             1
1             2
2             3
Animal    Bears
dtype: object

In [58]:
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'], 
                                   index=['Cricket',
                                          'Cricket',
                                          'Cricket',
                                          'Cricket'])
all_countries = original_sports.append(cricket_loving_countries)

In [59]:
original_sports


Out[59]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [60]:
cricket_loving_countries


Out[60]:
Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

In [61]:
all_countries


Out[61]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
Cricket        Australia
Cricket         Barbados
Cricket         Pakistan
Cricket          England
dtype: object

In [62]:
all_countries.loc['Cricket']


Out[62]:
Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

The DataFrame Data Structure


In [4]:
import pandas as pd
purchase_1 = pd.Series({'Name': 'Chris',
                        'Item Purchased': 'Dog Food',
                        'Cost': 22.50})
purchase_2 = pd.Series({'Name': 'Kevyn',
                        'Item Purchased': 'Kitty Litter',
                        'Cost': 2.50})
purchase_3 = pd.Series({'Name': 'Vinod',
                        'Item Purchased': 'Bird Seed',
                        'Cost': 5.00})
df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2'])
df.head()


Out[4]:
Cost Item Purchased Name
Store 1 22.5 Dog Food Chris
Store 1 2.5 Kitty Litter Kevyn
Store 2 5.0 Bird Seed Vinod

In [65]:
df.loc['Store 2']


Out[65]:
Cost                      5
Item Purchased    Bird Seed
Name                  Vinod
Name: Store 2, dtype: object

In [66]:
type(df.loc['Store 2'])


Out[66]:
pandas.core.series.Series

In [67]:
df.loc['Store 1']


Out[67]:
Cost Item Purchased Name
Store 1 22.5 Dog Food Chris
Store 1 2.5 Kitty Litter Kevyn

In [68]:
df.loc['Store 1', 'Cost']


Out[68]:
Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64

In [69]:
df.T


Out[69]:
Store 1 Store 1 Store 2
Cost 22.5 2.5 5
Item Purchased Dog Food Kitty Litter Bird Seed
Name Chris Kevyn Vinod

In [70]:
df.T.loc['Cost']


Out[70]:
Store 1    22.5
Store 1     2.5
Store 2       5
Name: Cost, dtype: object

In [71]:
df['Cost']


Out[71]:
Store 1    22.5
Store 1     2.5
Store 2     5.0
Name: Cost, dtype: float64

In [72]:
df.loc['Store 1']['Cost']


Out[72]:
Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64

In [73]:
df.loc[:,['Name', 'Cost']]


Out[73]:
Name Cost
Store 1 Chris 22.5
Store 1 Kevyn 2.5
Store 2 Vinod 5.0

In [82]:
df[['Name', 'Cost']]


Out[82]:
Name Cost
Store 1 Chris 22.5
Store 1 Kevyn 2.5
Store 2 Vinod 5.0

In [74]:
df['Name']


Out[74]:
Store 1    Chris
Store 1    Kevyn
Store 2    Vinod
Name: Name, dtype: object

In [75]:
df.drop('Store 1')


Out[75]:
Cost Item Purchased Name
Store 2 5.0 Bird Seed Vinod

In [76]:
df


Out[76]:
Cost Item Purchased Name
Store 1 22.5 Dog Food Chris
Store 1 2.5 Kitty Litter Kevyn
Store 2 5.0 Bird Seed Vinod

In [77]:
copy_df = df.copy()
copy_df = copy_df.drop('Store 1')
copy_df


Out[77]:
Cost Item Purchased Name
Store 2 5.0 Bird Seed Vinod

In [78]:
copy_df.drop?

In [79]:
del copy_df['Name']
copy_df


Out[79]:
Cost Item Purchased
Store 2 5.0 Bird Seed

In [83]:
df['Location'] = None
df


Out[83]:
Cost Item Purchased Name Location
Store 1 22.5 Dog Food Chris None
Store 1 2.5 Kitty Litter Kevyn None
Store 2 5.0 Bird Seed Vinod None

Dataframe Indexing and Loading


In [3]:
costs = df['Cost']
costs


Out[3]:
Store 1    22.5
Store 1     2.5
Store 2     5.0
Name: Cost, dtype: float64

In [4]:
costs+=2
costs


Out[4]:
Store 1    24.5
Store 1     4.5
Store 2     7.0
Name: Cost, dtype: float64

In [5]:
df


Out[5]:
Cost Item Purchased Name
Store 1 24.5 Dog Food Chris
Store 1 4.5 Kitty Litter Kevyn
Store 2 7.0 Bird Seed Vinod

In [2]:
!cat olympics.csv


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
,№ Summer,01 !,02 !,03 !,Total,№ Winter,01 !,02 !,03 !,Total,№ Games,01 !,02 !,03 !,Combined total
Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12
Australasia (ANZ) [ANZ],2,3,4,5,12,0,0,0,0,0,2,3,4,5,12
Australia (AUS) [AUS] [Z],25,139,152,177,468,18,5,3,4,12,43,144,155,181,480
Austria (AUT),26,18,33,35,86,22,59,78,81,218,48,77,111,116,304
Azerbaijan (AZE),5,6,5,15,26,5,0,0,0,0,10,6,5,15,26
Bahamas (BAH),15,5,2,5,12,0,0,0,0,0,15,5,2,5,12
Bahrain (BRN),8,0,0,1,1,0,0,0,0,0,8,0,0,1,1
Barbados (BAR) [BAR],11,0,0,1,1,0,0,0,0,0,11,0,0,1,1
Belarus (BLR),5,12,24,39,75,6,6,4,5,15,11,18,28,44,90
Belgium (BEL),25,37,52,53,142,20,1,1,3,5,45,38,53,56,147
Bermuda (BER),17,0,0,1,1,7,0,0,0,0,24,0,0,1,1
Bohemia (BOH) [BOH] [Z],3,0,1,3,4,0,0,0,0,0,3,0,1,3,4
Botswana (BOT),9,0,1,0,1,0,0,0,0,0,9,0,1,0,1
Brazil (BRA),21,23,30,55,108,7,0,0,0,0,28,23,30,55,108
British West Indies (BWI) [BWI],1,0,0,2,2,0,0,0,0,0,1,0,0,2,2
Bulgaria (BUL) [H],19,51,85,78,214,19,1,2,3,6,38,52,87,81,220
Burundi (BDI),5,1,0,0,1,0,0,0,0,0,5,1,0,0,1
Cameroon (CMR),13,3,1,1,5,1,0,0,0,0,14,3,1,1,5
Canada (CAN),25,59,99,121,279,22,62,56,52,170,47,121,155,173,449
Chile (CHI) [I],22,2,7,4,13,16,0,0,0,0,38,2,7,4,13
China (CHN) [CHN],9,201,146,126,473,10,12,22,19,53,19,213,168,145,526
Colombia (COL),18,2,6,11,19,1,0,0,0,0,19,2,6,11,19
Costa Rica (CRC),14,1,1,2,4,6,0,0,0,0,20,1,1,2,4
Ivory Coast (CIV) [CIV],12,0,1,0,1,0,0,0,0,0,12,0,1,0,1
Croatia (CRO),6,6,7,10,23,7,4,6,1,11,13,10,13,11,34
Cuba (CUB) [Z],19,72,67,70,209,0,0,0,0,0,19,72,67,70,209
Cyprus (CYP),9,0,1,0,1,10,0,0,0,0,19,0,1,0,1
Czech Republic (CZE) [CZE],5,14,15,15,44,6,7,9,8,24,11,21,24,23,68
Czechoslovakia (TCH) [TCH],16,49,49,45,143,16,2,8,15,25,32,51,57,60,168
Denmark (DEN) [Z],26,43,68,68,179,13,0,1,0,1,39,43,69,68,180
Djibouti (DJI) [B],7,0,0,1,1,0,0,0,0,0,7,0,0,1,1
Dominican Republic (DOM),13,3,2,1,6,0,0,0,0,0,13,3,2,1,6
Ecuador (ECU),13,1,1,0,2,0,0,0,0,0,13,1,1,0,2
Egypt (EGY) [EGY] [Z],21,7,9,10,26,1,0,0,0,0,22,7,9,10,26
Eritrea (ERI),4,0,0,1,1,0,0,0,0,0,4,0,0,1,1
Estonia (EST),11,9,9,15,33,9,4,2,1,7,20,13,11,16,40
Ethiopia (ETH),12,21,7,17,45,2,0,0,0,0,14,21,7,17,45
Finland (FIN),24,101,84,117,302,22,42,62,57,161,46,143,146,174,463
France (FRA) [O] [P] [Z],27,202,223,246,671,22,31,31,47,109,49,233,254,293,780
Gabon (GAB),9,0,1,0,1,0,0,0,0,0,9,0,1,0,1
Georgia (GEO),5,6,5,14,25,6,0,0,0,0,11,6,5,14,25
Germany (GER) [GER] [Z],15,174,182,217,573,11,78,78,53,209,26,252,260,270,782
United Team of Germany (EUA) [EUA],3,28,54,36,118,3,8,6,5,19,6,36,60,41,137
East Germany (GDR) [GDR],5,153,129,127,409,6,39,36,35,110,11,192,165,162,519
West Germany (FRG) [FRG],5,56,67,81,204,6,11,15,13,39,11,67,82,94,243
Ghana (GHA) [GHA],13,0,1,3,4,1,0,0,0,0,14,0,1,3,4
Great Britain (GBR) [GBR] [Z],27,236,272,272,780,22,10,4,12,26,49,246,276,284,806
Greece (GRE) [Z],27,30,42,39,111,18,0,0,0,0,45,30,42,39,111
Grenada (GRN),8,1,0,0,1,0,0,0,0,0,8,1,0,0,1
Guatemala (GUA),13,0,1,0,1,1,0,0,0,0,14,0,1,0,1
Guyana (GUY) [GUY],16,0,0,1,1,0,0,0,0,0,16,0,0,1,1
Haiti (HAI) [J],14,0,1,1,2,0,0,0,0,0,14,0,1,1,2
Hong Kong (HKG) [HKG],15,1,1,1,3,4,0,0,0,0,19,1,1,1,3
Hungary (HUN),25,167,144,165,476,22,0,2,4,6,47,167,146,169,482
Iceland (ISL),19,0,2,2,4,17,0,0,0,0,36,0,2,2,4
India (IND) [F],23,9,6,11,26,9,0,0,0,0,32,9,6,11,26
Indonesia (INA),14,6,10,11,27,0,0,0,0,0,14,6,10,11,27
Iran (IRI) [K],15,15,20,25,60,10,0,0,0,0,25,15,20,25,60
Iraq (IRQ),13,0,0,1,1,0,0,0,0,0,13,0,0,1,1
Ireland (IRL),20,9,8,12,29,6,0,0,0,0,26,9,8,12,29
Israel (ISR),15,1,1,5,7,6,0,0,0,0,21,1,1,5,7
Italy (ITA) [M] [S],26,198,166,185,549,22,37,34,43,114,48,235,200,228,663
Jamaica (JAM) [JAM],16,17,30,20,67,7,0,0,0,0,23,17,30,20,67
Japan (JPN),21,130,126,142,398,20,10,17,18,45,41,140,143,160,443
Kazakhstan (KAZ),5,16,17,19,52,6,1,3,3,7,11,17,20,22,59
Kenya (KEN),13,25,32,29,86,3,0,0,0,0,16,25,32,29,86
North Korea (PRK),9,14,12,21,47,8,0,1,1,2,17,14,13,22,49
South Korea (KOR),16,81,82,80,243,17,26,17,10,53,33,107,99,90,296
Kuwait (KUW),12,0,0,2,2,0,0,0,0,0,12,0,0,2,2
Kyrgyzstan (KGZ),5,0,1,2,3,6,0,0,0,0,11,0,1,2,3
Latvia (LAT),10,3,11,5,19,10,0,4,3,7,20,3,15,8,26
Lebanon (LIB),16,0,2,2,4,16,0,0,0,0,32,0,2,2,4
Liechtenstein (LIE),16,0,0,0,0,18,2,2,5,9,34,2,2,5,9
Lithuania (LTU),8,6,5,10,21,8,0,0,0,0,16,6,5,10,21
Luxembourg (LUX) [O],22,1,1,0,2,8,0,2,0,2,30,1,3,0,4
Macedonia (MKD),5,0,0,1,1,5,0,0,0,0,10,0,0,1,1
Malaysia (MAS) [MAS],12,0,3,3,6,0,0,0,0,0,12,0,3,3,6
Mauritius (MRI),8,0,0,1,1,0,0,0,0,0,8,0,0,1,1
Mexico (MEX),22,13,21,28,62,8,0,0,0,0,30,13,21,28,62
Moldova (MDA),5,0,2,5,7,6,0,0,0,0,11,0,2,5,7
Mongolia (MGL),12,2,9,13,24,13,0,0,0,0,25,2,9,13,24
Montenegro (MNE),2,0,1,0,1,2,0,0,0,0,4,0,1,0,1
Morocco (MAR),13,6,5,11,22,6,0,0,0,0,19,6,5,11,22
Mozambique (MOZ),9,1,0,1,2,0,0,0,0,0,9,1,0,1,2
Namibia (NAM),6,0,4,0,4,0,0,0,0,0,6,0,4,0,4
Netherlands (NED) [Z],25,77,85,104,266,20,37,38,35,110,45,114,123,139,376
Netherlands Antilles (AHO) [AHO] [I],13,0,1,0,1,2,0,0,0,0,15,0,1,0,1
New Zealand (NZL) [NZL],22,42,18,39,99,15,0,1,0,1,37,42,19,39,100
Niger (NIG),11,0,0,1,1,0,0,0,0,0,11,0,0,1,1
Nigeria (NGR),15,3,8,12,23,0,0,0,0,0,15,3,8,12,23
Norway (NOR) [Q],24,56,49,43,148,22,118,111,100,329,46,174,160,143,477
Pakistan (PAK),16,3,3,4,10,2,0,0,0,0,18,3,3,4,10
Panama (PAN),16,1,0,2,3,0,0,0,0,0,16,1,0,2,3
Paraguay (PAR),11,0,1,0,1,1,0,0,0,0,12,0,1,0,1
Peru (PER) [L],17,1,3,0,4,2,0,0,0,0,19,1,3,0,4
Philippines (PHI),20,0,2,7,9,4,0,0,0,0,24,0,2,7,9
Poland (POL),20,64,82,125,271,22,6,7,7,20,42,70,89,132,291
Portugal (POR),23,4,8,11,23,7,0,0,0,0,30,4,8,11,23
Puerto Rico (PUR),17,0,2,6,8,6,0,0,0,0,23,0,2,6,8
Qatar (QAT),8,0,0,4,4,0,0,0,0,0,8,0,0,4,4
Romania (ROU),20,88,94,119,301,20,0,0,1,1,40,88,94,120,302
Russia (RUS) [RUS],5,132,121,142,395,6,49,40,35,124,11,181,161,177,519
Russian Empire (RU1) [RU1],3,1,4,3,8,0,0,0,0,0,3,1,4,3,8
Soviet Union (URS) [URS],9,395,319,296,1010,9,78,57,59,194,18,473,376,355,1204
Unified Team (EUN) [EUN],1,45,38,29,112,1,9,6,8,23,2,54,44,37,135
Saudi Arabia (KSA),10,0,1,2,3,0,0,0,0,0,10,0,1,2,3
Senegal (SEN),13,0,1,0,1,5,0,0,0,0,18,0,1,0,1
Serbia (SRB) [SRB],3,1,2,4,7,2,0,0,0,0,5,1,2,4,7
Serbia and Montenegro (SCG) [SCG],3,2,4,3,9,3,0,0,0,0,6,2,4,3,9
Singapore (SIN),15,0,2,2,4,0,0,0,0,0,15,0,2,2,4
Slovakia (SVK) [SVK],5,7,9,8,24,6,2,2,1,5,11,9,11,9,29
Slovenia (SLO),6,4,6,9,19,7,2,4,9,15,13,6,10,18,34
South Africa (RSA),18,23,26,27,76,6,0,0,0,0,24,23,26,27,76
Spain (ESP) [Z],22,37,59,35,131,19,1,0,1,2,41,38,59,36,133
Sri Lanka (SRI) [SRI],16,0,2,0,2,0,0,0,0,0,16,0,2,0,2
Sudan (SUD),11,0,1,0,1,0,0,0,0,0,11,0,1,0,1
Suriname (SUR) [E],11,1,0,1,2,0,0,0,0,0,11,1,0,1,2
Sweden (SWE) [Z],26,143,164,176,483,22,50,40,54,144,48,193,204,230,627
Switzerland (SUI),27,47,73,65,185,22,50,40,48,138,49,97,113,113,323
Syria (SYR),12,1,1,1,3,0,0,0,0,0,12,1,1,1,3
Chinese Taipei (TPE) [TPE] [TPE2],13,2,7,12,21,11,0,0,0,0,24,2,7,12,21
Tajikistan (TJK),5,0,1,2,3,4,0,0,0,0,9,0,1,2,3
Tanzania (TAN) [TAN],12,0,2,0,2,0,0,0,0,0,12,0,2,0,2
Thailand (THA),15,7,6,11,24,3,0,0,0,0,18,7,6,11,24
Togo (TOG),9,0,0,1,1,1,0,0,0,0,10,0,0,1,1
Tonga (TGA),8,0,1,0,1,1,0,0,0,0,9,0,1,0,1
Trinidad and Tobago (TRI) [TRI],16,2,5,11,18,3,0,0,0,0,19,2,5,11,18
Tunisia (TUN),13,3,3,4,10,0,0,0,0,0,13,3,3,4,10
Turkey (TUR),21,39,25,24,88,16,0,0,0,0,37,39,25,24,88
Uganda (UGA),14,2,3,2,7,0,0,0,0,0,14,2,3,2,7
Ukraine (UKR),5,33,27,55,115,6,2,1,4,7,11,35,28,59,122
United Arab Emirates (UAE),8,1,0,0,1,0,0,0,0,0,8,1,0,0,1
United States (USA) [P] [Q] [R] [Z],26,976,757,666,2399,22,96,102,84,282,48,1072,859,750,2681
Uruguay (URU),20,2,2,6,10,1,0,0,0,0,21,2,2,6,10
Uzbekistan (UZB),5,5,5,10,20,6,1,0,0,1,11,6,5,10,21
Venezuela (VEN),17,2,2,8,12,4,0,0,0,0,21,2,2,8,12
Vietnam (VIE),14,0,2,0,2,0,0,0,0,0,14,0,2,0,2
Virgin Islands (ISV),11,0,1,0,1,7,0,0,0,0,18,0,1,0,1
Yugoslavia (YUG) [YUG],16,26,29,28,83,14,0,3,1,4,30,26,32,29,87
Independent Olympic Participants (IOP) [IOP],1,0,1,2,3,0,0,0,0,0,1,0,1,2,3
Zambia (ZAM) [ZAM],12,0,1,1,2,0,0,0,0,0,12,0,1,1,2
Zimbabwe (ZIM) [ZIM],12,3,4,1,8,1,0,0,0,0,13,3,4,1,8
Mixed team (ZZX) [ZZX],3,8,5,4,17,0,0,0,0,0,3,8,5,4,17
Totals,27,4809,4775,5130,14714,22,959,958,948,2865,49,5768,5733,6078,17579

In [5]:
df = pd.read_csv('olympics.csv')
df.head()


Out[5]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0 NaN № Summer 01 ! 02 ! 03 ! Total № Winter 01 ! 02 ! 03 ! Total № Games 01 ! 02 ! 03 ! Combined total
1 Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
2 Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
3 Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
4 Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12

In [7]:
df = pd.read_csv('olympics.csv', index_col = 0, skiprows=1)
df.head()


Out[7]:
№ Summer 01 ! 02 ! 03 ! Total № Winter 01 !.1 02 !.1 03 !.1 Total.1 № Games 01 !.2 02 !.2 03 !.2 Combined total
Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12

In [9]:
df.columns


Out[9]:
Index(['№ Summer', '01 !', '02 !', '03 !', 'Total', '№ Winter', '01 !.1',
       '02 !.1', '03 !.1', 'Total.1', '№ Games', '01 !.2', '02 !.2', '03 !.2',
       'Combined total'],
      dtype='object')

In [9]:
for col in df.columns:
    if col[:2]=='01':
        df.rename(columns={col:'Gold' + col[4:]}, inplace=True)
    if col[:2]=='02':
        df.rename(columns={col:'Silver' + col[4:]}, inplace=True)
    if col[:2]=='03':
        df.rename(columns={col:'Bronze' + col[4:]}, inplace=True)
    if col[:1]=='№':
        df.rename(columns={col:'#' + col[1:]}, inplace=True) 

df.head()


Out[9]:
# Summer Gold Silver Bronze Total # Winter Gold.1 Silver.1 Bronze.1 Total.1 # Games Gold.2 Silver.2 Bronze.2 Combined total
Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12

Querying a DataFrame


In [11]:
df['Gold'] > 0


Out[11]:
Afghanistan (AFG)                               False
Algeria (ALG)                                    True
Argentina (ARG)                                  True
Armenia (ARM)                                    True
Australasia (ANZ) [ANZ]                          True
Australia (AUS) [AUS] [Z]                        True
Austria (AUT)                                    True
Azerbaijan (AZE)                                 True
Bahamas (BAH)                                    True
Bahrain (BRN)                                   False
Barbados (BAR) [BAR]                            False
Belarus (BLR)                                    True
Belgium (BEL)                                    True
Bermuda (BER)                                   False
Bohemia (BOH) [BOH] [Z]                         False
Botswana (BOT)                                  False
Brazil (BRA)                                     True
British West Indies (BWI) [BWI]                 False
Bulgaria (BUL) [H]                               True
Burundi (BDI)                                    True
Cameroon (CMR)                                   True
Canada (CAN)                                     True
Chile (CHI) [I]                                  True
China (CHN) [CHN]                                True
Colombia (COL)                                   True
Costa Rica (CRC)                                 True
Ivory Coast (CIV) [CIV]                         False
Croatia (CRO)                                    True
Cuba (CUB) [Z]                                   True
Cyprus (CYP)                                    False
                                                ...  
Sri Lanka (SRI) [SRI]                           False
Sudan (SUD)                                     False
Suriname (SUR) [E]                               True
Sweden (SWE) [Z]                                 True
Switzerland (SUI)                                True
Syria (SYR)                                      True
Chinese Taipei (TPE) [TPE] [TPE2]                True
Tajikistan (TJK)                                False
Tanzania (TAN) [TAN]                            False
Thailand (THA)                                   True
Togo (TOG)                                      False
Tonga (TGA)                                     False
Trinidad and Tobago (TRI) [TRI]                  True
Tunisia (TUN)                                    True
Turkey (TUR)                                     True
Uganda (UGA)                                     True
Ukraine (UKR)                                    True
United Arab Emirates (UAE)                       True
United States (USA) [P] [Q] [R] [Z]              True
Uruguay (URU)                                    True
Uzbekistan (UZB)                                 True
Venezuela (VEN)                                  True
Vietnam (VIE)                                   False
Virgin Islands (ISV)                            False
Yugoslavia (YUG) [YUG]                           True
Independent Olympic Participants (IOP) [IOP]    False
Zambia (ZAM) [ZAM]                              False
Zimbabwe (ZIM) [ZIM]                             True
Mixed team (ZZX) [ZZX]                           True
Totals                                           True
Name: Gold, dtype: bool

In [12]:
#boulean mask
only_gold = df.where(df['Gold'] > 0)
only_gold.head()


Out[12]:
# Summer Gold Silver Bronze Total # Winter Gold.1 Silver.1 Bronze.1 Total.1 # Games Gold.2 Silver.2 Bronze.2 Combined total
Afghanistan (AFG) NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Algeria (ALG) 12.0 5.0 2.0 8.0 15.0 3.0 0.0 0.0 0.0 0.0 15.0 5.0 2.0 8.0 15.0
Argentina (ARG) 23.0 18.0 24.0 28.0 70.0 18.0 0.0 0.0 0.0 0.0 41.0 18.0 24.0 28.0 70.0
Armenia (ARM) 5.0 1.0 2.0 9.0 12.0 6.0 0.0 0.0 0.0 0.0 11.0 1.0 2.0 9.0 12.0
Australasia (ANZ) [ANZ] 2.0 3.0 4.0 5.0 12.0 0.0 0.0 0.0 0.0 0.0 2.0 3.0 4.0 5.0 12.0

In [14]:
only_gold['Gold'].sum()


Out[14]:
9618.0

In [15]:
df['Gold'].count()


Out[15]:
147

In [16]:
only_gold = only_gold.dropna()
only_gold.head()


Out[16]:
# Summer Gold Silver Bronze Total # Winter Gold.1 Silver.1 Bronze.1 Total.1 # Games Gold.2 Silver.2 Bronze.2 Combined total
Algeria (ALG) 12.0 5.0 2.0 8.0 15.0 3.0 0.0 0.0 0.0 0.0 15.0 5.0 2.0 8.0 15.0
Argentina (ARG) 23.0 18.0 24.0 28.0 70.0 18.0 0.0 0.0 0.0 0.0 41.0 18.0 24.0 28.0 70.0
Armenia (ARM) 5.0 1.0 2.0 9.0 12.0 6.0 0.0 0.0 0.0 0.0 11.0 1.0 2.0 9.0 12.0
Australasia (ANZ) [ANZ] 2.0 3.0 4.0 5.0 12.0 0.0 0.0 0.0 0.0 0.0 2.0 3.0 4.0 5.0 12.0
Australia (AUS) [AUS] [Z] 25.0 139.0 152.0 177.0 468.0 18.0 5.0 3.0 4.0 12.0 43.0 144.0 155.0 181.0 480.0

In [17]:
only_gold = df[df['Gold'] > 0]
only_gold.head()


Out[17]:
# Summer Gold Silver Bronze Total # Winter Gold.1 Silver.1 Bronze.1 Total.1 # Games Gold.2 Silver.2 Bronze.2 Combined total
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12
Australia (AUS) [AUS] [Z] 25 139 152 177 468 18 5 3 4 12 43 144 155 181 480

In [18]:
len(df[(df['Gold'] > 0) | (df['Gold.1'] > 0)])


Out[18]:
101

In [19]:
df[(df['Gold.1'] > 0) & (df['Gold'] == 0)]


Out[19]:
# Summer Gold Silver Bronze Total # Winter Gold.1 Silver.1 Bronze.1 Total.1 # Games Gold.2 Silver.2 Bronze.2 Combined total
Liechtenstein (LIE) 16 0 0 0 0 18 2 2 5 9 34 2 2 5 9

Indexing Dataframes


In [10]:
df.head()


Out[10]:
# Summer Gold Silver Bronze Total # Winter Gold.1 Silver.1 Bronze.1 Total.1 # Games Gold.2 Silver.2 Bronze.2 Combined total
Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12

In [11]:
# if changing index, he index column will be deleted so create a new column for the old index country
df['country'] = df.index
df = df.set_index('Gold')
df.head()


Out[11]:
# Summer Silver Bronze Total # Winter Gold.1 Silver.1 Bronze.1 Total.1 # Games Gold.2 Silver.2 Bronze.2 Combined total country
Gold
0 13 0 2 2 0 0 0 0 0 13 0 0 2 2 Afghanistan (AFG)
5 12 2 8 15 3 0 0 0 0 15 5 2 8 15 Algeria (ALG)
18 23 24 28 70 18 0 0 0 0 41 18 24 28 70 Argentina (ARG)
1 5 2 9 12 6 0 0 0 0 11 1 2 9 12 Armenia (ARM)
3 2 4 5 12 0 0 0 0 0 2 3 4 5 12 Australasia (ANZ) [ANZ]

In [ ]:
df = df.reset_index()
df.head()

In [ ]:
df = pd.read_csv('census.csv')
df.head()

In [ ]:
df['SUMLEV'].unique()

In [ ]:
df=df[df['SUMLEV'] == 50]
df.head()

In [ ]:
columns_to_keep = ['STNAME',
                   'CTYNAME',
                   'BIRTHS2010',
                   'BIRTHS2011',
                   'BIRTHS2012',
                   'BIRTHS2013',
                   'BIRTHS2014',
                   'BIRTHS2015',
                   'POPESTIMATE2010',
                   'POPESTIMATE2011',
                   'POPESTIMATE2012',
                   'POPESTIMATE2013',
                   'POPESTIMATE2014',
                   'POPESTIMATE2015']
df = df[columns_to_keep]
df.head()

In [ ]:
df = df.set_index(['STNAME', 'CTYNAME'])
df.head()

In [ ]:
df.loc['Michigan', 'Washtenaw County']

In [ ]:
df.loc[ [('Michigan', 'Washtenaw County'),
         ('Michigan', 'Wayne County')] ]

Missing values


In [ ]:
df = pd.read_csv('log.csv')
df

In [ ]:
df.fillna?

In [ ]:
df = df.set_index('time')
df = df.sort_index()
df

In [ ]:
df = df.reset_index()
df = df.set_index(['time', 'user'])
df

In [ ]:
df = df.fillna(method='ffill')
df.head()